
********* This program generates data with occupation, education, and industry categories used for fixed effects in the regressions *********

clear all
set more off, permanently
cd "$localdir\Data"
global output "$localdir\Output"


/***** Get raw data *****/
use registerdata19802012_us, clear 


* All variables in small letters
foreach var of varlist _all {
rename `var' `=lower("`var'") '
}

* Only use years 2003-2012
keep if year>=2003 & year<=2012


/***** Occupation fixed effects *****/
* Look at 1-digit groups (10 groups)
* Use conversion table form DST to get consistent measures
* Many missing/unspecified occupation codes: impute with most recent code since 2003 or future code up to 2012
* Use 2003 since this is the earliest year used for imputed industry codes (due to change in definitins in 2002-2003)

tab year if ocp=="" | ocp=="000000" | ocp=="999999" | ocp=="99900" // Definition of unspecified changes over time, around 3m each year
replace ocp="" if ocp=="" | ocp=="000000" | ocp=="999999" | ocp=="99900" |  ocp=="     ."
g miss_ocp=0
replace miss_ocp=1 if ocp==""
tab year miss_ocp, row // 54-56 % each year

g ocp1=substr(ocp,1,1)

tab ocp1
tab year ocp1, row

* Merge on corrections for first digit related to change of definitions 2009-2010
g ocp4=substr(ocp,1,4) // The ISCO code
sort ocp4
merge m:1 ocp4 using "Correction ISCO 1st digit"
tab _merge
codebook ocp4 if _merge==3 // Number of unique matched ocp codes = 81

tostring new_ocp1, replace
replace ocp1=new_ocp1 if _merge==3 & year>=2010 // Replace post-2009 1st digit codes, which are in the correction sheet
// All 1st digit codes now correspond to old, pre-2010, definitions, and are consistent over time  

drop _merge new_ocp1 ocp4

* Gen OK variable to find valid ocp code in the past or the future
g ok=0
replace ok=1 if ocp1!=""

* Find most recent ocp1-code since 2003 in each year
forval i=2003/2012 {
gen ok`i'=ok
replace ok`i'=0 if year>=`i'
gsort pnr -ok`i' -year 
by pnr: gen count=_n if year<=`i'-1 & ok`i'==1
gen pastocp1`i'=ocp1 if ok`i'==1 & count==1
bysort pnr (pastocp1`i'): replace pastocp1`i'=pastocp1`i'[_N] // Most recent available industry
gen temp=year if ok`i'==1 & count==1
bysort pnr: egen yearpastocp1`i'=total(temp) // Year from which most recent industry info is taken
drop ok`i' count temp
replace yearpastocp1`i'=. if yearpastocp1`i'==0
}

forval i=2003/2012 {
tab pastocp1`i'
}

* Find nearest future ocp1-code up to 2012 in each year
forval i=2003/2012 {
gen ok`i'=ok
replace ok`i'=0 if year<=`i'
gsort pnr -ok`i' year 
by pnr: gen count=_n if year>=`i'+1 & ok`i'==1
gen futureocp1`i'=ocp1 if ok`i'==1 & count==1
bysort pnr (futureocp1`i'): replace futureocp1`i'=futureocp1`i'[_N] 
gen temp=year if ok`i'==1 & count==1
bysort pnr: egen yearfutureocp1`i'=total(temp) // Year from which nearest future industry info is taken
drop ok`i' count temp
replace yearfutureocp1`i'=. if yearfutureocp1`i'==0
}


* Make one variable for past/future occupation
gen pastocp1=""
gen yearpastocp1=.
gen futureocp1=""
gen yearfutureocp1=.

forval i=2003/2012 {
replace pastocp1=pastocp1`i' if year==`i'
replace yearpastocp1=yearpastocp1`i' if year==`i'
}

forval i=2003/2012 {
replace futureocp1=futureocp1`i' if year==`i'
replace yearfutureocp1=yearfutureocp1`i' if year==`i'
}

tab year if ocp1!=""
tab year if pastocp1!=""
tab year if futureocp1!=""

* Check missings
tab year if miss_ocp==1
tab year if ocp1=="" // Should be the same

g missocp1_pastfuture=0
replace missocp1_pastfuture=1 if ocp1=="" & pastocp1=="" & futureocp1==""

tab year miss_ocp if year>=2008, row // 53-55 % missing
tab year missocp1_pastfuture if year>=2008, row // 37-38 % missing

* In the labor force = employed or unemployed 
destring primlabor, replace
g lf=0
replace lf=1 if primlabor<=37 | primlabor==40 | primlabor==95 | (primlabor>=71 & primlabor<=77)
tab year miss_ocp if year>=2008 & lf==1, row // 17-21 %
tab year missocp1_pastfuture if year>=2008 & lf==1, row // 5-6 %



/***** Education fixed effects *****/
* Variable AUDD gives highest achieved education, but this code itself is not meaningful for education classification
* Use formats to get a meaningful code for education classification; AFSP1E
* AFSP1E: first 2 digits=main group, next 2=intermediate group, next 2=lower group, last 2=elementary
* Formats: use most recent formats (2014) - format dataset "uddan_2014_audd"
* AFSP1E renamed "educcode"

g educ1=substr(educcode,1,2) // Main
g educ2=substr(educcode,1,4) // Intermediate
g educ3=substr(educcode,1,6) // Lower

* Dummies for each group
tab educ1, g(educ1_d)
qui tab educ2, g(educ2_d)
qui tab educ3, g(educ3_d)

codebook educ1 // 9 groups
codebook educ2 // 72 groups
codebook educ3 // 410 groups
codebook educcode // 1818 unique codes
* Note: these groups correspond to 2011-formats



/***** Save final data with occupation and education fixed effects *****/
* Keep identifier for occupation 1st-digit group, past and future 1st-digit groups + year from which it is taken, and identifiers for education groups

keep pnr year ocp miss_ocp ocp1 pastocp1 yearpastocp1 futureocp1 yearfutureocp1 educcode educ1 educ2 educ3

destring ocp1 pastocp1 futureocp1, replace 
destring educ1 educ2 educ3, replace

keep if year>=2008 & year<=2012

save occupation_education_20082012, replace 








/***** Industry fixed effects *****/

/*** Collect industry-codes back to 2003, make codes consistent over time, merge on formats ***/
* Use previous years' codes to impute missings/unspecified codes in 2008-2012 data

*** Raw data
* IDAN registry: only people who are "active" in the labor market
use idan19802013, clear

rename aar year
keep if year>=2003 & year<=2012
gen use=1
replace use=0 if persbrc=="     ."
replace use=0 if persbrc==""
tab year use, row
gsort pnr year -use -ansdage
duplicates drop pnr year, force // Keep unique pnr+year for non-missing industry with highest # of hours worked
keep pnr year persbrc

* Merge with population data to get indicator for those not in IDAN 
sort pnr year
merge 1:1 pnr year using pnr_20082012
keep pnr year persbrc _merge
tab year _merge, row
keep if year>=2003 & year<=2012
keep if _merge==3 | (_merge==2 & year>=2008) | (_merge==1 & year<=2007)
gen mis_work=0 
replace mis_work=1 if _merge==2 & year>=2008 // Dummy for not in IDAN 2008-2012
drop _merge

* All variables small letters
foreach var of varlist _all {
rename `var' `=lower("`var'") '
}

* Rename and label variables
rename persbrc industry
lab var industry "industry"

* Keep variables
keep pnr year industry mis_work


*** Industry codes
* Convert 2003-2006 definitions (DB03 codes) to 2007 definitions (DB07 codes)
gen db03=industry if year<=2007 
tab year if db03!=""

sort db03
merge m:1 db03 using DB03_to_DB07_unique
tab _merge
tab _merge if year<=2007
tab _merge if db03!=""
tab year _merge, row
drop if _merge==2
rename _merge _mergedb07

replace db07=industry if year>=2008 & year!=.
rename db07 industry_newfmt

drop db03 industry // Keep pnr, year, industry_newf

* Add industry formats (DB07)

rename industry_newfmt DB07

sort DB07 
merge m:1 DB07 using db07fmts
tab _merge
drop if _merge==2
tab DB07 if _merge==1 // Only industry=. for not-matched DB07 industry codes
tab year if DB07=="     ."
tab year if DB07==""
tab year if DB07=="" & mis_work==0
rename _merge _mergefmt

rename DB07 industry_newfmt

* Keep industry code unchanged, remember that code 999999 means "unspecified"
* Do not use industry=999999 or missing when creating 1st digit and 2nd digit groups
* Check that no og unique industry codes, 1st digit groups and 2nd digit groups are the same all years 2003-2011

egen ind_1 = group(GRP5) if GRP5!="11" & GRP5!=""
egen ind_2 = group(GRP4) if GRP4!="X" & GRP4!=""
egen ind_3=group(GRP3) if GRP3!="X" & GRP3!=""
tab ind_1 // 10 groups
tab ind_2 // 19 groups
tab ind_3 // 36 groups
lab var ind_1 "Industry 1st level"
lab var ind_2 "Industry 2nd level"
lab var ind_3 "Industry 3rd level"

* Missings and unspecified
tab year if industry_newfmt==""
tab year if industry_newfmt=="     ."
tab year if industry_newfmt=="999999"
replace industry_newfmt="" if industry_newfmt=="     ."
tab year if industry_newfmt==""

tab year if ind_1==.
tab year if ind_2==.
tab year if ind_3==.

keep pnr year industry_newfmt ind_1 ind_2 ind_3 mis_work

* Get previous and future industry for people with missing or unspecified code
tab year mis_work, row // Not in IDAN data
gen mis=0
replace mis=1 if industry_newfmt=="" // In IDAN but missing
tab year mis, row
gen unspec=0
replace unspec=1 if industry_newfmt=="999999" // In IDAN but unspecified
tab year unspec, row
gen ok=0
replace ok=1 if mis==0 & unspec==0 & mis_work==0
tab year ok, row

* Most recent industry in 2008, 2009, 2010, 2011, and 2012
forval i=2008/2012 {
gen ok`i'=ok
replace ok`i'=0 if year>=`i'
gsort pnr -ok`i' -year 
by pnr: gen count=_n if year<=`i'-1 & ok`i'==1
gen pastind`i'=industry_newfmt if ok`i'==1 & count==1
bysort pnr (pastind`i'): replace pastind`i'=pastind`i'[_N] // Most recent available industry
gen temp=ind_1 if ok`i'==1 & count==1
bysort pnr: egen pastind1_`i'=sum(temp)
drop temp
gen temp=ind_2 if ok`i'==1 & count==1
bysort pnr: egen pastind2_`i'=sum(temp)
drop temp
gen temp=ind_3 if ok`i'==1 & count==1
bysort pnr: egen pastind3_`i'=sum(temp)
drop temp
gen temp=year if ok`i'==1 & count==1
bysort pnr: egen yearpastind`i'=total(temp) // Year from which most recent industry info is taken
drop ok`i' count temp
replace pastind1_`i'=. if pastind1_`i'==0
replace pastind2_`i'=. if pastind2_`i'==0
replace pastind3_`i'=. if pastind3_`i'==0
replace yearpastind`i'=. if yearpastind`i'==0
}

forval i=2008/2012 {
tab pastind1_`i'
tab pastind2_`i'
tab pastind3_`i'
}

* Nearest future industry in 2008, 2009, 2010, and 2011
forval i=2008/2011 {
gen ok`i'=ok
replace ok`i'=0 if year<=`i'
gsort pnr -ok`i' year 
by pnr: gen count=_n if year>=`i'+1 & ok`i'==1
gen futureind`i'=industry_newfmt if ok`i'==1 & count==1
bysort pnr (futureind`i'): replace futureind`i'=futureind`i'[_N] 
gen temp=ind_1 if ok`i'==1 & count==1
bysort pnr: egen futureind1_`i'=total(temp)
drop temp
gen temp=ind_2 if ok`i'==1 & count==1
bysort pnr: egen futureind2_`i'=total(temp)
drop temp
gen temp=ind_3 if ok`i'==1 & count==1
bysort pnr: egen futureind3_`i'=total(temp)
drop temp
gen temp=year if ok`i'==1 & count==1
bysort pnr: egen yearfutureind`i'=total(temp) // Year from which nearest future industry info is taken
drop ok`i' count temp
replace futureind1_`i'=. if futureind1_`i'==0
replace futureind2_`i'=. if futureind2_`i'==0
replace futureind3_`i'=. if futureind3_`i'==0
replace yearfutureind`i'=. if yearfutureind`i'==0
}


* Make one variables for past/future industry 
gen pastind=""
gen pastind_1=.
gen pastind_2=.
gen pastind_3=.
gen yearpastind=.
gen futureind=""
gen futureind_1=.
gen futureind_2=.
gen futureind_3=.
gen yearfutureind=.

forval i=2008/2012 {
replace pastind=pastind`i' if year==`i'
replace pastind_1=pastind1_`i' if year==`i'
replace pastind_2=pastind2_`i' if year==`i'
replace pastind_3=pastind3_`i' if year==`i'
replace yearpastind=yearpastind`i' if year==`i'
}

forval i=2008/2011 {
replace futureind=futureind`i' if year==`i'
replace futureind_1=futureind1_`i' if year==`i'
replace futureind_2=futureind2_`i' if year==`i'
replace futureind_3=futureind3_`i' if year==`i'
replace yearfutureind=yearfutureind`i' if year==`i'
}

* Stats: How many people have some industry info - from current year, past or future
gen ok_2=0
replace ok_2=1 if (industry_newfmt!="" & industry_newfmt!="999999") | (pastind!="" & pastind!="999999") | (futureind!="" & futureind!="999999")
tab year ok, row
tab year ok_2, row
tab year if ind_1!=. | pastind_1!=. | futureind_1!=.
tab year if ind_2!=. | pastind_2!=. | futureind_2!=.
tab year if ind_3!=. | pastind_3!=. | futureind_3!=.

* Keep relevant variables for the years 2008-2012
keep year pnr industry_newfmt ind_1 ind_2 ind_3 mis_work mis unspec pastind pastind_1 pastind_2 pastind_3 yearpastind futureind futureind_1 futureind_2 futureind_3 yearfutureind
keep if year>=2008 & year<=2012

* Save final data
save industry_20082012, replace



















